{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Speed comparison of gradient boosting libraries for shap values calculations"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here we compare CatBoost, LightGBM and XGBoost for shap values calculations. All boosting algorithms were trained on GPU but shap evaluation was on CPU.\n",
"\n",
"We use the epsilon_normalized dataset from [here](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/)."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"import datetime\n",
"import os\n",
"\n",
"import catboost\n",
"import lightgbm as lgb\n",
"import numpy as np\n",
"import pandas as pd\n",
"import tqdm\n",
"import xgboost as xgb\n",
"from sklearn import datasets"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('0.11.2', '2.2.2', '0.81')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"catboost.__version__, lgb.__version__, xgb.__version__"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_data, train_target = datasets.load_svmlight_file(\"epsilon_normalized\")\n",
"test_data, test_target = datasets.load_svmlight_file(\n",
" \"epsilon_normalized.t\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parameters"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"num_iters = 1000\n",
"lr = 0.1\n",
"max_bin = 128\n",
"gpu_device = \"0\" # specify your GPU (used only for training)\n",
"random_state = 0"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"train_target[train_target == -1] = 0\n",
"test_target[test_target == -1] = 0"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_data(data, label=None, mode=\"train\", boosting=None):\n",
" assert boosting is not None\n",
"\n",
" if boosting == \"xgboost\":\n",
" return xgb.DMatrix(data, label)\n",
" elif boosting == \"lightgbm\":\n",
" if mode == \"train\":\n",
" return lgb.Dataset(data, label)\n",
" else:\n",
" return data\n",
" elif boosting == \"catboost\":\n",
" data = catboost.FeaturesData(num_feature_data=data)\n",
" return catboost.Pool(data, label)\n",
" else:\n",
" raise RuntimeError(\"Unknown boosting library\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def create_parameters(base_params, boosting=None, **kwargs):\n",
" assert boosting is not None\n",
" assert isinstance(base_params, dict)\n",
"\n",
" params = copy.copy(base_params)\n",
" if boosting == \"xgboost\":\n",
" params[\"objective\"] = \"binary:logistic\"\n",
" params[\"max_depth\"] = kwargs[\"depth\"]\n",
" params[\"tree_method\"] = \"gpu_hist\"\n",
" params[\"gpu_id\"] = gpu_device\n",
" elif boosting == \"lightgbm\":\n",
" params[\"objective\"] = \"binary\"\n",
" params[\"device\"] = \"gpu\"\n",
" params[\"gpu_device_id\"] = gpu_device\n",
" params[\"num_leaves\"] = 2 ** kwargs[\"depth\"]\n",
" elif boosting == \"catboost\":\n",
" params[\"objective\"] = \"Logloss\"\n",
" params[\"task_type\"] = \"GPU\"\n",
" params[\"devices\"] = gpu_device\n",
" params[\"bootstrap_type\"] = \"Bernoulli\"\n",
" params[\"logging_level\"] = \"Silent\"\n",
" else:\n",
" raise RuntimeError(\"Unknown boosting library\")\n",
"\n",
" return params"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def train(data, params, num_iters, boosting=None):\n",
" assert boosting is not None\n",
" if boosting == \"xgboost\":\n",
" return xgb.train(params=params, dtrain=data, num_boost_round=num_iters)\n",
" elif boosting == \"lightgbm\":\n",
" return lgb.train(params=params, train_set=data, num_boost_round=num_iters)\n",
" elif boosting == \"catboost\":\n",
" return catboost.train(pool=data, params=params, num_boost_round=num_iters)\n",
" else:\n",
" raise RuntimeError(\"Unknown boosting library\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def predict_shap(model, data, boosting=None):\n",
" assert boosting is not None\n",
" if boosting == \"xgboost\":\n",
" return model.predict(data, pred_contribs=True)\n",
" elif boosting == \"lightgbm\":\n",
" return model.predict(data, pred_contrib=True)\n",
" elif boosting == \"catboost\":\n",
" return model.get_feature_importance(data, fstr_type=\"ShapValues\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def create_path(boosting, params):\n",
" fname = [boosting]\n",
" for key, value in sorted(params.items()):\n",
" fname.append(str(key))\n",
" fname.append(str(value))\n",
" fname = \"_\".join(fname)\n",
" fname = fname.replace(\".\", \"\")\n",
" fname += \".model\"\n",
" return fname"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def load_model(fname, boosting):\n",
" if boosting == \"xgboost\":\n",
" bst = xgb.Booster(model_file=fname)\n",
" bst.load_model(fname)\n",
" elif boosting == \"lightgbm\":\n",
" bst = lgb.Booster(model_file=fname)\n",
" elif boosting == \"catboost\":\n",
" bst = catboost.CatBoost()\n",
" bst.load_model(fname)\n",
" else:\n",
" raise RuntimeError(\"Unknown boosting\")\n",
" return bst"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"base_params = {\"learning_rate\": lr, \"max_bin\": max_bin, \"random_state\": random_state}"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"result = []\n",
"\n",
"boosting_list = [\"xgboost\", \"catboost\", \"lightgbm\"]\n",
"depth_list = [2, 4, 6, 8, 10]\n",
"lens_list = [1000, 5000, 10000]\n",
"\n",
"\n",
"for gb_type in boosting_list:\n",
" print(f\"{gb_type} is going\")\n",
"\n",
" for size_test in lens_list:\n",
" print(f\"size test {size_test}\")\n",
" sep_test_data = test_data[:size_test]\n",
" sep_test_target = test_target[:size_test]\n",
"\n",
" # comment this line if you have already trained all models\n",
" train_preprocessed = preprocess_data(train_data, train_target, boosting=gb_type)\n",
"\n",
" dense_test = sep_test_data.todense().A.astype(np.float32)\n",
"\n",
" for depth in tqdm.tqdm(depth_list):\n",
" start_test_preproc = datetime.datetime.now()\n",
" test_preprocessed = preprocess_data(dense_test, sep_test_target, mode=\"test\", boosting=gb_type)\n",
"\n",
" finish_test_preproc = datetime.datetime.now()\n",
" preprocessing_delta = finish_test_preproc - start_test_preproc\n",
" preprocessing_delta = preprocessing_delta.total_seconds()\n",
"\n",
" params = create_parameters(base_params, boosting=gb_type, depth=depth)\n",
" params[\"depth\"] = depth\n",
" fname = create_path(gb_type, params)\n",
" if os.path.exists(fname):\n",
" print(\"model exist\")\n",
" bst = load_model(fname, boosting=gb_type)\n",
" else:\n",
" print(\"model is training\")\n",
" start_train = datetime.datetime.now()\n",
" bst = train(train_preprocessed, params, num_iters=num_iters, boosting=gb_type)\n",
" finish_train = datetime.datetime.now()\n",
" delta_train = finish_train - start_train\n",
" delta_train = int(delta_train.total_seconds() * 1000)\n",
" bst.save_model(fname)\n",
"\n",
" start_time = datetime.datetime.now()\n",
" preds = predict_shap(bst, test_preprocessed, boosting=gb_type)\n",
" assert preds.shape == (sep_test_data.shape[0], sep_test_data.shape[1] + 1)\n",
" finish_time = datetime.datetime.now()\n",
"\n",
" delta = finish_time - start_time\n",
" delta = delta.total_seconds()\n",
"\n",
" current_res = {\n",
" \"preprocessing_time\": preprocessing_delta,\n",
" \"boosting\": gb_type,\n",
" \"test_size\": size_test,\n",
" \"depth\": depth,\n",
" \"time\": delta,\n",
" }\n",
"\n",
" result.append(current_res)\n",
"\n",
" print(\"*\" * 40)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"result_df = pd.DataFrame(result)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"result_df.to_csv(f\"shap_benchmark_{max_bin}_max_bin_with_test_sizes.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" boosting | \n",
" catboost | \n",
" lightgbm | \n",
" xgboost | \n",
"
\n",
" \n",
" | test_size | \n",
" depth | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1000 | \n",
" 2 | \n",
" 0.311027 | \n",
" 0.090156 | \n",
" 0.112515 | \n",
"
\n",
" \n",
" | 4 | \n",
" 0.281931 | \n",
" 0.578531 | \n",
" 0.300671 | \n",
"
\n",
" \n",
" | 6 | \n",
" 0.464603 | \n",
" 4.159926 | \n",
" 1.468442 | \n",
"
\n",
" \n",
" | 8 | \n",
" 4.918599 | \n",
" 23.844245 | \n",
" 7.847191 | \n",
"
\n",
" \n",
" | 10 | \n",
" 93.152000 | \n",
" 119.527824 | \n",
" 30.872254 | \n",
"
\n",
" \n",
" | 5000 | \n",
" 2 | \n",
" 1.171963 | \n",
" 0.284673 | \n",
" 0.241316 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1.081119 | \n",
" 2.094985 | \n",
" 0.931881 | \n",
"
\n",
" \n",
" | 6 | \n",
" 1.319114 | \n",
" 20.624486 | \n",
" 6.498283 | \n",
"
\n",
" \n",
" | 8 | \n",
" 5.807985 | \n",
" 118.552238 | \n",
" 38.992395 | \n",
"
\n",
" \n",
" | 10 | \n",
" 95.049909 | \n",
" 601.251603 | \n",
" 153.408904 | \n",
"
\n",
" \n",
" | 10000 | \n",
" 2 | \n",
" 2.048301 | \n",
" 0.621454 | \n",
" 0.509722 | \n",
"
\n",
" \n",
" | 4 | \n",
" 2.263058 | \n",
" 4.291201 | \n",
" 1.935541 | \n",
"
\n",
" \n",
" | 6 | \n",
" 2.396371 | \n",
" 42.788038 | \n",
" 12.981580 | \n",
"
\n",
" \n",
" | 8 | \n",
" 7.078056 | \n",
" 240.614644 | \n",
" 77.883250 | \n",
"
\n",
" \n",
" | 10 | \n",
" 95.680684 | \n",
" 1189.685032 | \n",
" 306.529277 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"boosting catboost lightgbm xgboost\n",
"test_size depth \n",
"1000 2 0.311027 0.090156 0.112515\n",
" 4 0.281931 0.578531 0.300671\n",
" 6 0.464603 4.159926 1.468442\n",
" 8 4.918599 23.844245 7.847191\n",
" 10 93.152000 119.527824 30.872254\n",
"5000 2 1.171963 0.284673 0.241316\n",
" 4 1.081119 2.094985 0.931881\n",
" 6 1.319114 20.624486 6.498283\n",
" 8 5.807985 118.552238 38.992395\n",
" 10 95.049909 601.251603 153.408904\n",
"10000 2 2.048301 0.621454 0.509722\n",
" 4 2.263058 4.291201 1.935541\n",
" 6 2.396371 42.788038 12.981580\n",
" 8 7.078056 240.614644 77.883250\n",
" 10 95.680684 1189.685032 306.529277"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_df = pd.read_csv(\n",
" \"shap_benchmark_128_max_bin_with_test_sizes.csv\",\n",
")\n",
"result_df.pivot_table(index=[\"test_size\", \"depth\"], columns=\"boosting\", values=\"time\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | boosting | \n",
" catboost | \n",
" lightgbm | \n",
" xgboost | \n",
"
\n",
" \n",
" | test_size | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1000 | \n",
" 0.069569 | \n",
" 0.002816 | \n",
" 0.011025 | \n",
"
\n",
" \n",
" | 5000 | \n",
" 0.349831 | \n",
" 0.000006 | \n",
" 0.047836 | \n",
"
\n",
" \n",
" | 10000 | \n",
" 0.770179 | \n",
" 0.000006 | \n",
" 0.089032 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"boosting catboost lightgbm xgboost\n",
"test_size \n",
"1000 0.069569 0.002816 0.011025\n",
"5000 0.349831 0.000006 0.047836\n",
"10000 0.770179 0.000006 0.089032"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_df.pivot_table(index=\"test_size\", columns=\"boosting\", values=\"preprocessing_time\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}